0. Read files and check the basic info for each file

library(ggplot2)
library(HardyWeinberg)
## Loading required package: mice
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Loading required package: Rsolnp
library(ggfortify)
library(MASS)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:HardyWeinberg':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
library("gridExtra")
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
geno <- read.csv('./data_files/genotypes.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1)   ## 344 50000
pheno <- read.csv('./data_files/phenotypes.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1)   ## 344   5
covar <- read.csv('./data_files/covars.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1)   ## 344   2

## Check the rownames of geno, pheno and covar files are consistent
sum(rownames(geno) == rownames(pheno))   ## 344
## [1] 344
sum(rownames(geno) == rownames(covar))   ## 344
## [1] 344

calculate the number of samples n and the numebr of SNPs N

n = dim(geno)[1]
N = dim(geno)[2]

make sure the genotype file looks good

geno[1:6,1:6]
##         rs10399749 rs62641299 rs115523412 rs75932129 rs10900604 rs58686784
## HG00096          0          2           0          1          1          1
## HG00097          0          2           0          2          0          1
## HG00099          1          1           0          0          1          1
## HG00100          0          2           0          2          0          0
## HG00101          1          2           0          1          1          0
## HG00102          0          2           0          1          0          0

make sure the phenotype file looks good

head(pheno)
##         ENSG00000164308.12 ENSG00000124587.9 ENSG00000180185.7
## HG00096          -1.166337       -0.68593575        -0.3062421
## HG00097          -1.045803       -0.17895757        -1.0333305
## HG00099           1.045803       -0.89457468        -2.5242603
## HG00100           0.223446       -0.09824326         1.1521112
## HG00101          -0.223446        0.86251206         1.3782830
## HG00102          -1.457684       -0.32916763        -0.1863454
##         ENSG00000168827.9 ENSG00000136536.9
## HG00096         0.9275844        0.36000876
## HG00097        -0.6676644       -1.77825296
## HG00099        -2.7590424       -2.52426034
## HG00100         0.2533471        0.05451891
## HG00101         1.0842344        0.26085685
## HG00102        -1.1663369       -2.37832893

make sure the covar file looks good

head(covar)
##         Population    Sex
## HG00096        GBR   MALE
## HG00097        GBR FEMALE
## HG00099        GBR FEMALE
## HG00100        GBR FEMALE
## HG00101        GBR   MALE
## HG00102        GBR FEMALE

The gene name for the genes measured in the phenotype file

symbol <- c('ERAP2', 'PEX6', 'FAHD1', 'GFM1', 'MARCH7')

1. Check the distribution of the phenotype file

1.1 check the distribution of relative gene expression for each gene

for (i in 1:dim(pheno)[2]){
  p <- ggplot(pheno, aes(x=pheno[,i])) + geom_density()
  p <- p + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i]))
  p <- p + xlab(paste0(symbol[i], '_relative expression'))
  plot(p)
  ggsave(paste0('./gene',i,'_expr_density.pdf'), width=4, height=3)
}

1.2 Check the distribution of genotype conditional on population structure

1.2.1 Generate table for sample number of each population/gender

geno_covar <- merge(geno, covar, by = 'row.names')
rownames(geno_covar) <- geno_covar$Row.names
geno_covar <- geno_covar[,-1]
geno_covar$pop_sex <- interaction(geno_covar$Population, geno_covar$Sex)
table(geno_covar$pop_sex)
## 
## CEU.FEMALE FIN.FEMALE GBR.FEMALE TSI.FEMALE   CEU.MALE   FIN.MALE 
##         37         55         46         43         41         34 
##   GBR.MALE   TSI.MALE 
##         39         49
pheno_covar <- merge(pheno, covar, by = 'row.names')
rownames(pheno_covar) <- pheno_covar$Row.names
pheno_covar <- pheno_covar[,-1]
pheno_covar$pop_sex <- interaction(pheno_covar$Population, pheno_covar$Sex)
table(pheno_covar$pop_sex)
## 
## CEU.FEMALE FIN.FEMALE GBR.FEMALE TSI.FEMALE   CEU.MALE   FIN.MALE 
##         37         55         46         43         41         34 
##   GBR.MALE   TSI.MALE 
##         39         49

1.2.2 PCA plot for genotype distribution conditional on population structure

df <- geno_covar[,c(1:50000)]
autoplot(prcomp(df), data = geno_covar, colour = 'Population', main = 'PCA plot_genotype -- population')

ggsave(paste0('./PCA plot_genotype -- population.pdf'), width=6, height=4)
autoplot(prcomp(df), data = geno_covar, colour = 'Sex', main = 'PCA plot_genotype -- gender')

ggsave(paste0('./PCA plot_genotype -- gender.pdf'), width=6, height=4)
autoplot(prcomp(df), data = geno_covar, colour = 'pop_sex', main = 'PCA plot_genotype -- population & gender')

ggsave(paste0('./PCA plot_genotype -- population & gender.pdf'), width=6, height=4)
#pca.result <- prcomp(geno)
#pca.result$sdev
#summary(pca.result)
#pcaDf <- data.frame(pc1=pca.result$x[,1], pc2=pca.result$x[,2])
#ggplot(pcaDf,aes(pc1,pc2)) + geom_point() + ggtitle("Genotypes with Population Structure")

1.3 Check the distribution of the phenotype conditional on population structure

1.3.1 Density plot of phenotype data conditional on population/gender

for (i in 1:dim(pheno)[2]){
  p1 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=Population)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
  p1 <- p1 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by population'))
  plot(p1)
  ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by population.pdf'), width=5, height=4)
  
  p2 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=Sex)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
  p2 <- p2 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by gender'))
  plot(p2)
  ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by gender.pdf'), width=5, height=4)
  
  p3 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=pop_sex)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
  p3 <- p3 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by population & gender'))
  plot(p3)
  ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by population and gender.pdf'), width=5, height=4)
}

1.3.2 PCA plot of phenotype data conditional on population/gender

df <- pheno_covar[,c(1:5)]
autoplot(prcomp(df), data = pheno_covar, colour = 'Population', main = 'PCA plot_phenotype -- population')

ggsave(paste0('./PCA plot_phenotype -- population.pdf'), width=6, height=4)
autoplot(prcomp(df), data = pheno_covar, colour = 'Sex', main = 'PCA plot_pheno -- gender')

ggsave(paste0('./PCA plot_phenotype -- gender.pdf'), width=6, height=4)
autoplot(prcomp(df), data = pheno_covar, colour = 'pop_sex', main = 'PCA plot_pheno -- population & gender')

ggsave(paste0('./PCA plot_phenotype -- population & gender.pdf'), width=6, height=4)

1.4 Check minor allele frequency

maf_cal <- function(geno_col){
  x = sum(geno_col)/(2*length(geno_col))
  if (x >= 0.5){
    return(1-x)
  } else if (x < 0.5){
    return(x)
  }
}

maf <- apply(geno, 2, maf_cal)
sum(maf > 0.05)
## [1] 50000
p <- ggplot() + geom_histogram(aes(x = maf), color = 'black', fill = "darkblue", bins = 100)
p <- p + xlab('minor allele frequency')
p <- p + ylab('counts')
p <- p + ggtitle('Histogram for the minor allele frequency')
p

ggsave(paste0('./MAF.pdf'), width=6, height=4)

2. Linear regression model

xa_converter <- function(x){
  if (x == 0){
    return(-1)
  } else if (x == 1) {
    return(0)
  } else if (x == 2){
    return(1)
  } 
}

xd_converter <- function(x){
  if (x == 0 | x == 2){
    return(-1)
  } else if (x == 1){
    return(1)
  }
}

Xa <- matrix(0, nrow = dim(geno)[1], ncol = dim(geno)[2])
Xd <- matrix(0, nrow = dim(geno)[1], ncol = dim(geno)[2])

for (i in c(1:N)){
  if (sum(geno[,i]/(2*n)) >= 0.5) {
    Xa[,i] <- sapply(geno[,i], xa_converter)
  } else if (sum(geno[,i]/(2*n)) < 0.5) {
    Xa[,i] <- sapply(geno[,i], xa_converter)
    Xa[,i] <- Xa[,i] * (-1)
  }
}

Xd <- apply(geno, 1:2, xd_converter)
colnames(Xd) <- NULL
rownames(Xd) <- NULL
pval_F_statistic <- function(Xa, Xd, y) {
  output <- as.numeric()
  for (i in 1:N){
    x <- matrix(c(matrix(1, nrow = n), Xa[,i], Xd[,i]), nrow = n)
    y <- matrix(y, nrow = n)
    MLE <- ginv(t(x)%*%x)%*%t(x)%*%y
    y_pred <- x%*%MLE
    SSM <- sum((y_pred-mean(y))^2)
    DFM <- 3-1
    SSE <- sum((y_pred-y)^2)
    DFE <- n-3
    MSM <- SSM/DFM
    MSE <- SSE/DFE
    F <- MSM/MSE
    output[i] <- pf(F, 2, n-3, lower.tail = FALSE)
  }
  return(output)
}
pval_mx1 <- matrix(0,nrow = dim(geno)[2], ncol = dim(pheno)[2])
pval_df1 <- as.data.frame(pval_mx1)
rownames(pval_df1) <- colnames(geno)
colnames(pval_df1) <- colnames(pheno)
for (i in 1:dim(pheno)[2]){
  pval_df1[,i] <- pval_F_statistic(Xa, Xd, pheno[,i])
}

p-value heatmap

autoplot(pval_mx1, main = 'p-value heatmap for linear regression model without covariates')
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

ggsave(paste0('./p-value heatmap_1.pdf'), width=6, height=4)

2.1 Manhattan plot

snp <- read.csv('./data_files/SNP_info.csv', header = TRUE, stringsAsFactors = FALSE)
sum(snp$id == colnames(geno))
## [1] 50000
sum(snp$id == rownames(pval_df1))
## [1] 50000
rownames(snp) <- snp$id
pval_df1 <- merge(pval_df1, snp, by = 'row.names', sort = FALSE)
rownames(pval_df1) <- pval_df1$Row.names
pval_df1 <- pval_df1[,-1]
pval_df1$chr <- as.factor(paste0('chr',pval_df1$chromosome))
pval_df1$locus = c(1:N)
for (i in 1:5){
  p <- ggplot(data = pval_df1) + geom_point(aes(x = locus, y = -log10(pval_df1[,i]), color = chr), size = 1)
  p <- p + xlab("locus")
  p <- p + ylab("-log10(p-value)")
  p <- p + ggtitle(paste0('Manhattan Plot for gene',i,'_' , symbol[i]))
  p <- p + geom_hline(yintercept = -log10(0.05/N), color = 'red')
  plot(p)
  ggsave(paste0('./gene',i,'_', symbol[i],'_manhattan_plot.pdf'), width=12, height=3)
}

compute the potential causal SNPs for each gene

gene1_sig <- rownames(pval_df1)[which(pval_df1[,1] < 0.05/N)]   ##ENSG00000164308.12_ERAP2
gene1_sig_idx <- which(pval_df1[,1] < 0.05/N)
length(gene1_sig)
## [1] 73
gene2_sig <- rownames(pval_df1)[which(pval_df1[,2] < 0.05/N)]   ## ENSG00000124587.9_PEX6
gene2_sig_idx <- which(pval_df1[,2] < 0.05/N)
length(gene2_sig)
## [1] 29
gene3_sig <- rownames(pval_df1)[which(pval_df1[,3] < 0.05/N)]   ## ENSG00000180185.7_FAHD1
gene3_sig_idx <- which(pval_df1[,3] < 0.05/N)
length(gene3_sig)
## [1] 90
gene4_sig <- rownames(pval_df1)[which(pval_df1[,4] < 0.05/N)]   ## ENSG00000168827.9_GFM1
gene4_sig_idx <- which(pval_df1[,4] < 0.05/N)
length(gene4_sig)
## [1] 0
gene5_sig <- rownames(pval_df1)[which(pval_df1[,5] < 0.05/N)]   ## ENSG00000136536.9_MARCH7
gene5_sig_idx <- which(pval_df1[,5] < 0.05/N)
length(gene5_sig)
## [1] 0

2.2 QQ-plot

#pval_df1$expected_pval = sort(-log10(seq(0,1,length.out = N)))
expected_pval = sort(-log10(seq(0,1,length.out = N)))
for (i in 1:dim(pheno)[2]){
  p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df1[,i]))))
  p <- p + xlab("-log10(expected p-values)")
  p <- p + ylab("-log10(observed p-values)")
  p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i]))
  p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
  plot(p)
  ggsave(paste0('./gene',i,'_',symbol[i],'_QQ_plot.pdf'), width=5, height=4)
}

for (i in 1:3){
  p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df1[,i]))))
  p <- p + xlab("-log10(expected p-values)")
  p <- p + ylab("-log10(observed p-values)")
  p <- p + ggtitle(paste0("QQ plot for gene",i,'_', symbol[i]))
  p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
  p <- p + ylim(c(0,10))
  plot(p)
  ggsave(paste0('./gene',i,'_', symbol[i], '_QQ_plot_2.pdf'), width=5, height=4)
}
## Warning: Removed 54 rows containing missing values (geom_point).

## Warning: Removed 54 rows containing missing values (geom_point).

## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).

## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).

3. Linear regression model and include covariates to model the population structure and gender

pval_calculator_lab10 <- function(pheno_input, xa_input, xd_input, z_input){
    n_samples <- dim(Xa)[1]
    
    # Set up random variables for null (Z_mx) and with genotypes (XZ_mx)
    Z_mx <- cbind(1,z_input)                                          # H0 (w/ covariates)
    XZ_mx <- cbind(1,xa_input,xd_input,z_input)                       # w/ genotypes too
    
    # Calculate MLE betas for both null model and model with genotypes and covariates
    MLE_beta_theta0 <- ginv(t(Z_mx)  %*% Z_mx)  %*% t(Z_mx)  %*% pheno_input 
    MLE_beta_theta1 <- ginv(t(XZ_mx) %*% XZ_mx) %*% t(XZ_mx) %*% pheno_input
    
    # Get Y estimates using the betas calculated above to give each hypothesis its best chance
    y_hat_theta0 <- Z_mx  %*% MLE_beta_theta0
    y_hat_theta1 <- XZ_mx %*% MLE_beta_theta1
    
    # Get the variance between the true phenotype values and our estimates under each hypothesis
    SSE_theta0 <- sum((pheno_input - y_hat_theta0)^2)
    SSE_theta1 <- sum((pheno_input - y_hat_theta1)^2)
    
    # Set degrees of freedom
    df_M <- 2
    df_E <- n_samples - 3
    
    # Put together calculated terms to get Fstatistic
    Fstatistic <- ((SSE_theta0-SSE_theta1)/df_M) / (SSE_theta1/df_E)
    
    # Determine pval of the Fstatistic
    pval <- pf(Fstatistic, df_M, df_E,lower.tail = FALSE)
    return(pval)
}
covar$GBR <- ifelse(covar$Population == 'GBR',1,0)
covar$FIN <- ifelse(covar$Population == 'FIN',1,0)
covar$CEU <- ifelse(covar$Population == 'CEU',1,0)
covar$TSI <- ifelse(covar$Population == 'TSI',1,0)
covar$MALE <- ifelse(covar$Sex == 'MALE',1,0)
covar$FEMALE <- ifelse(covar$Sex == 'FEMALE',1,0)
Xz = as.matrix(covar[,c(3:8)])
pval_mx2 <- matrix(0,nrow = dim(geno)[2], ncol = dim(pheno)[2])
pval_df2 <- as.data.frame(pval_mx2)
rownames(pval_df2) <- colnames(geno)
colnames(pval_df2) <- colnames(pheno)
for (i in 1:dim(pheno)[2]){
  for (j in 1:dim(geno)[2]){
    pval <- pval_calculator_lab10(pheno[,i], Xa[,j], Xd[,j], Xz)
    pval_df2[j,i] <- pval
  }
}
autoplot(pval_mx2, main = 'p-value heatmap for linear regression model with covariates')
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

ggsave(paste0('./p-value heatmap_2.pdf'), width=6, height=4)

3.1 Manhattan plot

rownames(pval_df2) <- colnames(geno)
pval_df2 <- merge(pval_df2, snp, by = 'row.names', sort = FALSE)
rownames(pval_df2) <- pval_df2$Row.names
pval_df2 <- pval_df2[,-1]
pval_df2$chr <- as.factor(paste0('chr',pval_df2$chromosome))
pval_df2$locus = c(1:N)
for (i in 1:dim(pheno)[2]){
  p <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,i]), color = chr))
  p <- p + xlab("locus")
  p <- p + ylab("-log10(p-value)")
  p <- p + ggtitle(paste0('Manhattan Plot for gene',i,'_' , symbol[i], ' after modeling covariates'))
  p <- p + geom_hline(yintercept = -log10(0.05/N), color = 'red')
  plot(p)
  ggsave(paste0('./gene',i,'_', symbol[i],'_manhattan_plot_covar.pdf'), width=12, height=3)
}

compute the potential causal SNPs for each gene

gene1_sig_covar <- rownames(pval_df2)[which(pval_df2[,1] < 0.05/N)]   ##ENSG00000164308.12_ERAP2
gene1_sig_idx_covar <- which(pval_df2[,1] < 0.05/N)
length(gene1_sig_covar)
## [1] 74
gene2_sig_covar <- rownames(pval_df2)[which(pval_df2[,2] < 0.05/N)]   ## ENSG00000124587.9_PEX6
gene2_sig_idx_covar <- which(pval_df2[,2] < 0.05/N)
length(gene2_sig_covar)
## [1] 27
gene3_sig_covar <- rownames(pval_df2)[which(pval_df2[,3] < 0.05/N)]   ## ENSG00000180185.7_FAHD1
gene3_sig_idx_covar <- which(pval_df2[,3] < 0.05/N)
length(gene3_sig_covar)
## [1] 90
gene4_sig_covar <- rownames(pval_df2)[which(pval_df2[,4] < 0.05/N)]   ## ENSG00000168827.9_GFM1
gene4_sig_idx_covar <- which(pval_df2[,4] < 0.05/N)
length(gene4_sig_covar)
## [1] 0
gene5_sig_covar <- rownames(pval_df2)[which(pval_df2[,5] < 0.05/N)]   ## ENSG00000136536.9_MARCH7
gene5_sig_idx_covar <- which(pval_df2[,5] < 0.05/N)
length(gene5_sig_covar)
## [1] 0

3.2 QQ-plot

expected_pval = sort(-log10(seq(0,1,length.out = N)))
for (i in 1:dim(pheno)[2]){
  p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df2[,i]))))
  p <- p + xlab("-log10(expected p-values)")
  p <- p + ylab("-log10(observed p-values)")
  p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i], ' after modeling covariates'))
  p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
  plot(p)
  ggsave(paste0('./gene',i,'_', symbol[i],'_QQ_plot_covar.pdf'), width=5, height=4)
}

for (i in 1:3){
  p <- ggplot(data = pval_df2) + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df2[,i]))))
  p <- p + xlab("-log10(expected p-values)")
  p <- p + ylab("-log10(observed p-values)")
  p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i], ' after modeling covariates'))
  p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
  p <- p + ylim(c(0,10))
  plot(p)
  ggsave(paste0('./gene',i, '_', symbol[i], '_QQ_plot_covar_2.pdf'), width=5, height=4)
}
## Warning: Removed 53 rows containing missing values (geom_point).

## Warning: Removed 53 rows containing missing values (geom_point).

## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).

## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).

4. Further analyze the potential causal SNPs obtained from the linear regression model

4.1 Hardy-Weinberg equilibrium test

geno_HW <- as.numeric()
Xa_1 <- Xa
add_mx <- rbind(rep(-1,N), rep(0,N), rep(1,N))
Xa_1 <- rbind(Xa_1, add_mx)
Xa_1_summary <- apply(Xa_1, 2, table)   ## 14  59 271
allele_mx <- matrix((unlist(Xa_1_summary)-1), nrow=dim(geno)[2], byrow=T)
colnames(allele_mx) <- c('AA', 'AB', 'BB')
Exact.pvalues <- HWExactStats(allele_mx,x.linked=FALSE)
sum(Exact.pvalues < 0.05)
## [1] 3436
idx <- which(Exact.pvalues > 0.05)
gene1_sig_HW <- intersect(gene1_sig_covar, colnames(geno)[idx])
length(gene1_sig_HW)
## [1] 67
gene1_sig_HW_idx <- intersect(gene1_sig_idx_covar, idx)
gene2_sig_HW <- intersect(gene2_sig_covar, colnames(geno)[idx])
length(gene2_sig_HW)
## [1] 26
gene2_sig_HW_idx <- intersect(gene2_sig_idx_covar, idx)
gene3_sig_HW <- intersect(gene3_sig_covar, colnames(geno)[idx])
length(gene3_sig_HW)
## [1] 82
gene3_sig_HW_idx <- intersect(gene3_sig_idx_covar, idx)

4.2 Check the localization of the potential causal SNPs for each phenotype – Local Manhattan plot

pval_df2$gene1_sig <- ifelse(rownames(pval_df2)%in%gene1_sig_HW, 'sig', 'nosig')
pval_df2$gene2_sig <- ifelse(rownames(pval_df2)%in%gene2_sig_HW, 'sig', 'nosig')
pval_df2$gene3_sig <- ifelse(rownames(pval_df2)%in%gene3_sig_HW, 'sig', 'nosig')

4.2.1 gene1_ERAP2

cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene1_sig <- pval_df2[which(pval_df2$gene1_sig == 'sig'),]
unique(pval_df2_gene1_sig$chromosome)   ## chr5
## [1] 5
p1 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,1]), color = gene1_sig))
p1 <- p1 + scale_colour_manual(values = cols)
p1 <- p1 + xlab("locus") + ylab("-log10(p-value)")
p1 <- p1 + ggtitle('Manhattan Plot for gene1_ERAP2_local')
p1 <- p1 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p1 <- p1 + xlim(range(gene1_sig_HW_idx)[1]-100, range(gene1_sig_HW_idx)[2]+100)
plot(p1)
## Warning: Removed 49718 rows containing missing values (geom_point).

ggsave('./gene1_ERAP2_manhattan_plot_local.pdf', width=5, height=4)
## Warning: Removed 49718 rows containing missing values (geom_point).
gene1_sig_chr <- unique(pval_df2_gene1_sig$chromosome)
gene1_sig_loc <- pval_df2_gene1_sig$position
range(gene1_sig_loc)
## [1] 96772432 97110808
gene1_sig_loc_len <- range(gene1_sig_loc)[2] - range(gene1_sig_loc)[1]
gene1_sig_loc_len
## [1] 338376
print(paste0('The potential ', length(gene1_sig_loc), ' causal SNPs for gene1_ERAP2 are located on chromosome ', gene1_sig_chr, ', spans from ', range(gene1_sig_loc)[1], ' to ', range(gene1_sig_loc)[2], ', covers ', gene1_sig_loc_len, ' bp'))
## [1] "The potential 67 causal SNPs for gene1_ERAP2 are located on chromosome 5, spans from 96772432 to 97110808, covers 338376 bp"

4.2.2 gene1_PEX6

cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene2_sig <- pval_df2[which(pval_df2$gene2_sig == 'sig'),]
unique(pval_df2_gene2_sig$chromosome)   ## chr4 & chr6  ## chr4:"rs6854915"
## [1] 4 6
p2 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,2]), color = gene2_sig))
p2 <- p2 + scale_colour_manual(values = cols)
p2 <- p2 + xlab("locus") + ylab("-log10(p-value)")
p2 <- p2 + ggtitle('Manhattan Plot for gene2_PEX6_local')
p2 <- p2 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p2 <- p2 + xlim(range(gene2_sig_HW_idx)[1]-100, range(gene2_sig_HW_idx)[2]+100)
plot(p2)
## Warning: Removed 43877 rows containing missing values (geom_point).

ggsave('./gene2_PEX6_manhattan_plot_zoom_in.pdf', width=5, height=4)
## Warning: Removed 43877 rows containing missing values (geom_point).
cols <- c('sig' = 'red', 'nosig' = "black")
p2 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,2]), color = gene2_sig))
p2 <- p2 + scale_colour_manual(values = cols)
p2 <- p2 + xlab("locus") + ylab("-log10(p-value)")
p2 <- p2 + ggtitle('Manhattan Plot for gene2_PEX6_local_2')
p2 <- p2 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p2 <- p2 + xlim(range(gene2_sig_HW_idx[2:26])[1]-100, range(gene2_sig_HW_idx[2:26])[2]+100)
plot(p2)
## Warning: Removed 49772 rows containing missing values (geom_point).

ggsave('./gene2_PEX6_manhattan_plot_zoom_in_2.pdf', width=5, height=4)
## Warning: Removed 49772 rows containing missing values (geom_point).
gene2_sig_chr <- unique(pval_df2_gene2_sig[2:26,]$chromosome)
gene2_sig_loc <- pval_df2_gene2_sig[c(2:26),]$position
range(gene2_sig_loc)
## [1] 42889467 43108015
gene2_sig_loc_len <- range(gene2_sig_loc)[2] - range(gene2_sig_loc)[1]
gene2_sig_loc_len
## [1] 218548
print(paste0('The potential ', length(gene2_sig_loc), ' causal SNPs for gene2_PEX6 are located on chromosome ', gene2_sig_chr, ', spans from ', range(gene2_sig_loc)[1], ' to ', range(gene2_sig_loc)[2], ', covers ', gene2_sig_loc_len, ' bp'))
## [1] "The potential 25 causal SNPs for gene2_PEX6 are located on chromosome 6, spans from 42889467 to 43108015, covers 218548 bp"

4.2.3 gene1_FADH1

cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene3_sig <- pval_df2[which(pval_df2$gene3_sig == 'sig'),]
unique(pval_df2_gene3_sig$chromosome)   ## chr16
## [1] 16
p3 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,3]), color = gene3_sig))
p3 <- p3 + scale_colour_manual(values = cols)
p3 <- p3 + xlab("locus") + ylab("-log10(p-value)")
p3 <- p3 + ggtitle('Manhattan Plot for gene3_FAHD1_local')
p3 <- p3 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p3 <- p3 + xlim(range(gene3_sig_HW_idx)[1]-100, range(gene3_sig_HW_idx)[2]+100)
plot(p3)
## Warning: Removed 49700 rows containing missing values (geom_point).

ggsave('./gene3_FAHD1_manhattan_plot_local.pdf', width=5, height=4)
## Warning: Removed 49700 rows containing missing values (geom_point).
gene3_sig_chr <- unique(pval_df2_gene3_sig$chromosome)
gene3_sig_loc <- pval_df2_gene3_sig$position
range(gene3_sig_loc)
## [1] 1524250 1929366
gene3_sig_loc_len <- range(gene3_sig_loc)[2] - range(gene3_sig_loc)[1]
gene3_sig_loc_len
## [1] 405116
print(paste0('The potential ', length(gene3_sig_loc), ' causal SNPs for gene3_FAHD1 are located on chromosome ', gene3_sig_chr, ', spans from ', range(gene3_sig_loc)[1], ' to ', range(gene3_sig_loc)[2], ', covers ', gene3_sig_loc_len, ' bp'))
## [1] "The potential 82 causal SNPs for gene3_FAHD1 are located on chromosome 16, spans from 1524250 to 1929366, covers 405116 bp"

4.3 LD heatmap

LD_plot <- function(index, gene){
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
  
  index1 <- range(index)[1]-50
  index2 <- range(index)[2]+50
  Xa_100 <- Xa[,c(index1:index2)]
  num <- dim(Xa_100)[2]
  corr_mx = matrix(0, nrow = num, ncol = num)
  for (i in 1:num){
    for (j in 1:num){
      cor <- cor.test(Xa_100[,i], Xa_100[,j])
      corr_mx[i,j] <- (cor$estimate)^2}}
  
  upper_tri <- get_upper_tri(corr_mx)
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  
  p <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile()+ scale_fill_gradient2(low = "white", high = "red", limit = c(0,1), space = "Lab",name="r^2")
  p <- p + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1), axis.title.x = element_blank(), axis.title.y = element_blank(), panel.grid.major = element_blank(), panel.border = element_blank(), panel.background = element_blank())+ coord_fixed() + geom_segment(aes(x = 50, y = 50, xend = num-50, yend = 50), size = 0.5) + geom_segment(aes(x = num-50, y = 50, xend = num-50, yend = num-50), size = 0.5)
  p <- p + ggtitle(paste0('Correlation matrix for gene', gene, '_', symbol[gene]))
  ggsave(paste0('./correlation matrix for gene',gene,'_', symbol[gene],'.pdf'), width=5, height=4)
  plot(p)
  return(corr_mx)
}

4.3.1 gene1_ERAP2

corr_mx1 <- LD_plot(gene1_sig_HW_idx,1)

2 LD regions 1st covers 20 SNPs and 2nd covers 47 SNPs

rownames(pval_df2_gene1_sig)[1]   ## "rs200641494"
## [1] "rs200641494"
pval_df2_gene1_sig$position[1]   ## 96772432
## [1] 96772432
rownames(pval_df2_gene1_sig)[20]   ## "rs70981851"
## [1] "rs70981851"
pval_df2_gene1_sig$position[20]   ## 96848898
## [1] 96848898
rownames(pval_df2_gene1_sig)[21]   ## "rs2549778"
## [1] "rs2549778"
pval_df2_gene1_sig$position[21]   ## 96868550
## [1] 96868550
rownames(pval_df2_gene1_sig)[67]   ## "rs72777610"
## [1] "rs72777610"
pval_df2_gene1_sig$position[67]   ## 97110808
## [1] 97110808
gene1_sig_SNP <- rownames(pval_df2_gene1_sig)[21:67]
print('47 SNPs are identifed to be potential causal SNP for ERAP2')
## [1] "47 SNPs are identifed to be potential causal SNP for ERAP2"
gene1_sig_SNP
##  [1] "rs2549778"   "rs3840523"   "rs1230363"   "rs12189125"  "rs12653964" 
##  [6] "rs1559354"   "rs201477313" "rs201109800" "rs4869314"   "rs10707238" 
## [11] "rs2549784"   "rs2161657"   "rs6859160"   "rs251339"    "rs1423568"  
## [16] "rs2549791"   "rs2548525"   "rs13163165"  "rs2549801"   "rs2910688"  
## [21] "rs1363907"   "rs1230382"   "rs1216571"   "rs1216568"   "rs1046395"  
## [26] "rs2548226"   "rs6887500"   "rs7726445"   "rs1477364"   "rs7731592"  
## [31] "rs7723899"   "rs7716222"   "rs10058476"  "rs140336797" "rs2161548"  
## [36] "rs6879678"   "rs201866654" "rs9918183"   "rs1160962"   "rs199514005"
## [41] "rs27306"     "rs27307"     "rs11414909"  "rs27292"     "rs27747"    
## [46] "rs248215"    "rs72777610"

4.3.2 gene1_PEX6

corr_mx2 <- LD_plot(gene2_sig_HW_idx[2:26],2)

2 LD regions 1st covers 13 SNPs and 2nd covers 11 SNPs

pval_df2_gene2_sig$position[2]   ## 42889467
## [1] 42889467
pval_df2_gene2_sig$position[14]   ## 42946612
## [1] 42946612
pval_df2_gene2_sig$position[15]   ## 42949278
## [1] 42949278
pval_df2_gene2_sig$position[25]   ## 42977844
## [1] 42977844
pval_df2_gene2_sig$position[26]   ## 43108015
## [1] 43108015
gene2_sig_SNP <- rownames(pval_df2_gene2_sig)[15:25]
print('11 SNPs are identifed to be potential causal SNP for PEX6')
## [1] "11 SNPs are identifed to be potential causal SNP for PEX6"
gene2_sig_SNP
##  [1] "rs4714638"   "rs9471976"   "rs58497441"  "rs13215983"  "rs6920547"  
##  [6] "rs7760250"   "rs1129187"   "rs3805953"   "rs10948061"  "rs9471985"  
## [11] "rs199921136"

calculate the correlation between the potential causal SNP for gene2_PEX6 in chr4 and the remaining 25 SNPs in chr6

geno_gene2 <- geno[,gene2_sig_HW_idx]
corr_2c = c()
for (i in 1:26){
  cor <- cor.test(geno_gene2[,i], geno_gene2[,1])
  corr_2c <- c(corr_2c, (cor$estimate)^2)
}
corr_2c
##          cor          cor          cor          cor          cor 
## 1.0000000000 0.0011806704 0.0013166376 0.0008190977 0.0006665384 
##          cor          cor          cor          cor          cor 
## 0.0006665384 0.0006665384 0.0017531422 0.0017531422 0.0028664568 
##          cor          cor          cor          cor          cor 
## 0.0058871454 0.0097047428 0.0074004589 0.0061571783 0.0403803305 
##          cor          cor          cor          cor          cor 
## 0.0441912674 0.0423251303 0.0449502338 0.0410445826 0.0449502338 
##          cor          cor          cor          cor          cor 
## 0.0366164687 0.0449502338 0.0366164687 0.0457212415 0.0228560051 
##          cor 
## 0.0228664106

4.3.3 gene1_FAHD1

corr_mx3 <- LD_plot(gene3_sig_HW_idx,3)

gene3_sig_SNP <- rownames(pval_df2_gene3_sig)
print('82 SNPs are identifed to be potential causal SNP for FAHD1')
## [1] "82 SNPs are identifed to be potential causal SNP for FAHD1"
gene3_sig_SNP
##  [1] "rs2235639"   "rs75276217"  "rs7201813"   "rs1178435"   "rs2575352"  
##  [6] "rs1065663"   "rs3751893"   "rs2745205"   "rs2575345"   "rs2745195"  
## [11] "rs142797272" "rs145338194" "rs2473469"   "rs1657107"   "rs1629534"  
## [16] "rs7186249"   "rs8046750"   "rs9935687"   "rs9928566"   "rs2575357"  
## [21] "rs62038378"  "rs2076455"   "rs2268674"   "rs3760040"   "rs1657152"  
## [26] "rs28364708"  "rs1143034"   "rs11644748"  "rs8044343"   "rs34392705" 
## [31] "rs11641513"  "rs140254902" "rs12325082"  "rs9652776"   "rs11643835" 
## [36] "rs3813760"   "rs112311350" "rs141551352" "rs4598914"   "rs62038431" 
## [41] "rs62038440"  "rs57530073"  "rs138033584" "rs2575359"   "rs1742423"  
## [46] "rs1657118"   "rs1742432"   "rs410465"    "rs449530"    "rs388627"   
## [51] "rs433268"    "rs366223"    "rs2492879"   "rs2492881"   "rs391543"   
## [56] "rs150843657" "rs368188"    "rs1742446"   "rs4451947"   "rs2437747"  
## [61] "rs2575335"   "rs2369275"   "rs2815297"   "rs6600180"   "rs3952970"  
## [66] "rs1657100"   "rs2974857"   "rs2982447"   "rs2974860"   "rs2917517"  
## [71] "rs2906903"   "rs1632124"   "rs2982433"   "rs1657139"   "rs8059871"  
## [76] "rs740466"    "rs1742401"   "rs344364"    "rs344366"    "rs344369"   
## [81] "rs2575358"   "rs58530366"

5.Generate boxplot to view expression data conditional on each potential causal SNP

#gene1_sig_HW_idx
expr_plot <- function(gene, snp){
  data <- data.frame('gene1_expr' = pheno[,gene], 'geno' = as.factor(geno[,snp]))
  levels(data$geno) <- c('AA', 'AB', 'BB')
  rownames(data) <- rownames(pheno)
  p <- ggplot(data, aes(x = geno, y = gene1_expr, fill = geno)) + geom_boxplot() + geom_point(position=position_jitterdodge()) + ggtitle(paste0('Boxplot for expression of gene', gene, '_',symbol[gene],' at snp ', colnames(geno)[snp]))
  p <- p + xlab('Genotype')
  p <- p + ylab('Relative expression')
  #plot(p)
  ggsave(paste0('./gene',gene,'_',symbol[gene], ' at snp ', colnames(geno)[snp], '.pdf'), width=5, height=4)
}

#expr_plot(1,1)
for (i in 1:length(gene1_sig_HW_idx)){
  expr_plot(1, gene1_sig_HW_idx[i])
}
for (i in 1:length(gene2_sig_HW_idx)){
  expr_plot(2, gene2_sig_HW_idx[i])
}
for (i in 1:length(gene3_sig_HW_idx)){
  expr_plot(3, gene3_sig_HW_idx[i])
}